
/* 

Paper: Gentrification and pioneer businesses 
Authors: Behrens, Boulam, Martin, Mayneris 

Name dofile: create_db_concordedblocks.do  
Version: 15 nov. 2021 

Output: 
- census_blkgrp.dta 
- naics_id_conc_database_blkgp.dta 

Inputs:

- duns_1990_gis_bg.dta (mapping btw NETS and block groups shape files -- by the authors using GIS,)
- duns_1990_2010_idconc_naics.dta  (from NETS data)
- geo_variables.do (XX) 
- education.dta, housing.dta, income.dta, race.dta (from prepare_blockvar_*.dta datasets, publicly available)
- concord_blkgr9000 (authors computation, publicly available) 


*/ 

set more off

// raw data at the block and blockgroup levels 
use education.dta, clear 
merge 1:1 geoid year using income 
drop _m 
merge 1:1 geoid year using housing 
drop _m 

gen var=substr(geoid,1,9) if year==1990 & length(geoid)<14
gen var2=substr(geoid, 10,4) if year==1990 & length(geoid)<14
replace geoid=var+"00"+var2 if year==1990 & length(geoid)<14 // geoid are 12 or 13 character long instead of 14 or 15 for many blocks in 1990. We have a procedure to correct for that
drop var var2

merge 1:1 geoid year using geo_variables
replace geoid = substr(geoid,1,12) 

// collapse at the block group level
collapse (sum ) pop edu=edu_college_block (mean) percap_income=percap_income_block median_income=median_income_block  crent=median_crent_block grent=median_grent_block  (mean) intptlat intptlon, by(geoid year) 

// merge with concordance 
drop if year==2010 
merge 1:1 geoid year using concord_blkgr9000 
keep if _m==3 // 187 obs  in concord but not in census 

foreach v of var median_income percap_income crent grent {
bys id_conc year: egen pop_tot=sum(pop) if `v'!=.
gen sh=pop/pop_tot
replace `v'=sh*`v'
drop sh pop_tot
}

// aggregate at the id_conc level (stable geography)
collapse (sum) percap_income  median_income pop edu crent grent (mean) intptlat intptlon   (count) nb_blk=pop , by(id_conc year) 
replace edu=0 if pop==0  
drop if pop==.
g sh_edu=edu/pop 
gen lat_ws=40.706154
gen lon_ws=-74.008794
gen var = acos(sin((intptlat*_pi)/180) * sin((lat_ws*_pi)/180) + cos(abs((intptlon*_pi)/180- (lon_ws*_pi)/180)) * cos(intptlat*_pi/180) * cos(lat_ws*_pi/180)) * 6378.137
bys id_conc: egen dis_ws=mean(var)
drop lat_ws lon_ws var

// drop concorded block groups with more than 2 block groups inside
bys id_conc: egen var=sum(nb_blk)
sum var, det
drop if var>4
drop var

rename percap_income pc
keep id_conc pc sh_edu year pop grent crent dis_ws nb_blk 
sum pop if pop>0,d 
drop if pop<8 
egen _=count(year), by(id_conc) 
keep if _==2 
drop _ 
save census_blkgrp, replace 

// identify gentrifiying block groups
use census_blkgrp, clear 
keep if dis_ws<30
keep sh_edu pc pop id_conc year
reshape wide sh_edu pc pop, i(id_conc) j(year)  

foreach i in 1990 2000 {
local z 10 // # of bins 
xtile pctile_pc`i'=pc`i', n(`z')
xtile pctile_sh_edu`i'=sh_edu`i', n(`z')
}

foreach i in sh_edu pc {
g change_decile_`i'_9000=pctile_`i'2000 - pctile_`i'1990
g gr_`i'_9000=`i'2000/`i'1990 - 1
g dif_`i'_9000=`i'2000 - `i'1990 
}

// def. finale: below 6th pctile and an increase in the share of educated
g gentri_9000=(pctile_pc1990<6 & change_decile_pc_9000>2 & change_decile_pc_9000!=. ) & (change_decile_sh_edu_9000>0)  
g poor=pctile_pc1990<6
keep id_conc gentri_9000 poor poor 
save gentri_blckgrp9000, replace  


// add establishment data 
use duns_1990_2010_idconc_naics, clear
keep if year==1990 
keep year emp dunsnumber 
destring dunsnumber, replace
save tmp, replace 

use duns_1990_gis_bg.dta, clear
merge 1:1 dunsnumber year using tmp 
keep if _m==3 
drop _m 
erase tmp.dta 

tostring naics, replace
count if length(naics)!=6
count if length(naics)!=6&naics!="."

merge m:1 naics using naics5
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

merge m:1 naics using naics4
replace naics=naics02 if _merge==3
drop if _merge==2
drop _merge

gen toto=length(naics)
keep if toto==6
drop toto

egen ind=group(naics)
sum ind //1118 naics

gen nb_plants_sect=1
gen emp_sect=emp

collapse (sum) nb_plants_sect emp_sect, by(naics year id_conc) 
merge m:1 id_conc year using census_blkgrp
keep if year==1990 & dis_ws<30

egen sect=group(naics)
egen _=count(year), by(naics year) 
keep if _>50

keep if _m==3 // some BLKG do not have firms and BLKGP with less than 8 inhabitants are excluded from census_blkgrp
keep nb_plants_sect naics year id_conc emp_sect
egen emp_tot=sum(emp_sect), by(id_conc) 

egen t=group(year)

save naics_id_conc_database_blkgp, replace 

keep emp_tot id_conc year
duplicates drop id_conc year, force
rename emp_tot emp
merge 1:1 id_conc year using census_blkgrp
replace emp=0 if _m==2
drop _m
save census_blkgrp, replace


use census_blkgrp.dta, clear

egen t=group(year)
tsset id_conc t

keep id_conc pc sh_edu year pop dis_ws

reshape wide sh_edu pc pop, i(id_conc dis_ws) j(year)  

keep if dis_ws<30

merge 1:1 id_conc using gentri_blckgrp9000.dta 
drop _m

rename poor poor1990
label var poor1990 "Below median block 1990"

gen pc_gr1990_poor=(log(pc2000)-log(pc1990))*poor1990
gen sh_edu_gr1990_poor=(sh_edu2000-sh_edu1990)*poor1990

label var pc_gr1990_poor "Per cap. inc. growth 1990-2000 x Below median block 1990"
label var sh_edu_gr1990_poor "Share college educated growth 1990-2000 x Below median block 1990"

keep id_conc id_conc pc_gr1990_poor sh_edu_gr1990_poor
save temp.dta, replace

use census_blkgrp.dta, clear
keep if year==1990&dis_ws<30
merge 1:1 id_conc using temp.dta
keep if _m==3
drop _m

merge 1:1 id_conc using gentri_blckgrp9000.dta 
keep if _m==3
drop _m

rename poor poor1990
label var poor1990 "Below median block 1990"

foreach v of var pc pop emp crent grent{
gen l`v'=log(`v')
}

save blkgrp_pop_income.dta, replace




